1 Data Cleaning

## Parsed with column specification:
## cols(
##   user_id = col_character(),
##   program_id = col_character(),
##   series_id = col_character(),
##   genre = col_character(),
##   start_date_time = col_datetime(format = ""),
##   streaming_id = col_character(),
##   prog_duration_min = col_double(),
##   time_viewed_min = col_double(),
##   duration_more_30s = col_double(),
##   time_viewed_more_5s = col_double(),
##   percentage_program_viewed = col_double(),
##   watched_more_60_percent = col_double(),
##   month = col_double(),
##   day = col_double(),
##   hour = col_double(),
##   weekend = col_character(),
##   time_of_day = col_character()
## )
## Observations: 313,256
## Variables: 17
## $ user_id                   <chr> "cd2006", "cd2006", "cd2006", "cd2006", "cd…
## $ program_id                <chr> "b8fbf2", "e2f113", "0e0916", "ca03b9", "cf…
## $ series_id                 <chr> "e0480e", "933a1b", "b68e79", "5d0813", "eb…
## $ genre                     <chr> "Comedy", "Factual", "Entertainment", "Spor…
## $ start_date_time           <dttm> 2017-01-19 22:17:04, 2017-02-14 19:12:36, …
## $ streaming_id              <chr> "1484864257965_1", "1487099603980_1", "1484…
## $ prog_duration_min         <dbl> 1.850000, 0.500000, 1.366667, 1.616667, 8.5…
## $ time_viewed_min           <dbl> 1.85000000, 0.49908333, 1.36666667, 1.61666…
## $ duration_more_30s         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ time_viewed_more_5s       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ percentage_program_viewed <dbl> 1.000000000, 0.998166667, 1.000000000, 1.00…
## $ watched_more_60_percent   <dbl> 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0…
## $ month                     <dbl> 1, 2, 1, 2, 3, 2, 4, 3, 4, 3, 3, 4, 3, 3, 2…
## $ day                       <dbl> 5, 3, 4, 1, 1, 3, 1, 6, 7, 7, 1, 5, 7, 3, 2…
## $ hour                      <dbl> 22, 19, 21, 14, 20, 19, 20, 21, 21, 20, 18,…
## $ weekend                   <chr> "weekday", "weekday", "weekday", "weekend",…
## $ time_of_day               <chr> "Evening", "Evening", "Evening", "Day", "Ev…
## cleaned_BBC_Data 
## 
##  17  Variables      313256  Observations
## --------------------------------------------------------------------------------
## user_id 
##        n  missing distinct 
##   313256        0     9634 
## 
## lowest : 0001c6 000c1a 001c53 001d44 002b2e, highest: ffec33 ffec4c fff1fd fffd0b ffffca
## --------------------------------------------------------------------------------
## program_id 
##        n  missing distinct 
##   313256        0    20772 
## 
## lowest : 0.00E+00 00045f   0006cf   0006ef   000f59  
## highest: ffe7df   ffef0a   fff417   fff6de   fffca7  
## --------------------------------------------------------------------------------
## series_id 
##        n  missing distinct 
##   313256        0     4246 
## 
## lowest : 001b96 00389f 00400b 00434d 0079b7, highest: ff3bfd ff737e ff9060 ff96ec ffb887
## --------------------------------------------------------------------------------
## genre 
##        n  missing distinct 
##   313256        0       12 
## 
## Children (16294, 0.052), Comedy (28110, 0.090), Drama (77342, 0.247),
## Entertainment (24257, 0.077), Factual (88001, 0.281), Learning (1952, 0.006),
## Music (3924, 0.013), News (47448, 0.151), NoGenre (719, 0.002), RelEthics
## (1077, 0.003), Sport (17440, 0.056), Weather (6692, 0.021)
## --------------------------------------------------------------------------------
## start_date_time 
##                   n             missing            distinct                Info 
##              313256                   0              300690                   1 
##                Mean                 Gmd                 .05                 .10 
## 2017-02-26 23:53:32 1970-02-10 14:11:42 2017-01-05 22:32:41 2017-01-11 05:42:28 
##                 .25                 .50                 .75                 .90 
## 2017-01-27 09:19:00 2017-02-24 21:01:39 2017-03-29 18:00:58 2017-04-18 12:15:17 
##                 .95 
## 2017-04-24 19:14:02 
## 
## lowest : 2017-01-01 00:00:01 2017-01-01 00:00:03 2017-01-01 00:00:04 2017-01-01 00:00:05 2017-01-01 00:00:06
## highest: 2017-04-30 22:51:33 2017-04-30 22:52:27 2017-04-30 22:53:18 2017-04-30 22:53:51 2017-04-30 22:55:43
## --------------------------------------------------------------------------------
## streaming_id 
##        n  missing distinct 
##   313256        0   268031 
## 
## lowest : 1482085634977_2 1482088915673_2 1482224444938_1 1482432425783_2 1482620908113_1
## highest: 1493593235133_3 1493593282442_1 1493593664011_2 1493593706256_1 1493677079660_1
## --------------------------------------------------------------------------------
## prog_duration_min 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   313256        0     1143    0.985    49.63    39.73      0.5      5.0 
##      .25      .50      .75      .90      .95 
##     30.0     45.0     60.0     85.0    120.0 
## 
## lowest :   0.5000000   0.5166667   0.5166667   0.5333333   0.5333333
## highest: 480.0000000 485.0000000 500.0000000 510.0000000 720.0000000
## --------------------------------------------------------------------------------
## time_viewed_min 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   313256        0   219956        1    18.02    22.42   0.1264   0.1662 
##      .25      .50      .75      .90      .95 
##   1.0177  10.0007  28.0094  57.2412  59.0043 
## 
## lowest :   0.08333333   0.08333333   0.08335000   0.08336667   0.08336667
## highest: 279.66650000 290.00000000 321.49550000 335.00000000 355.00000000
## --------------------------------------------------------------------------------
## duration_more_30s 
##        n  missing distinct     Info     Mean      Gmd 
##   313256        0        1        0        1        0 
##                  
## Value           1
## Frequency  313256
## Proportion      1
## --------------------------------------------------------------------------------
## time_viewed_more_5s 
##        n  missing distinct     Info     Mean      Gmd 
##   313256        0        1        0        1        0 
##                  
## Value           1
## Frequency  313256
## Proportion      1
## --------------------------------------------------------------------------------
## percentage_program_viewed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   313256        0   250689    0.998   0.4624   0.4497 0.002671 0.006227 
##      .25      .50      .75      .90      .95 
## 0.053571 0.313905 0.966617 1.000000 1.000000 
## 
## lowest : 0.0001351620 0.0001606250 0.0001791667 0.0001928704 0.0001960417
## highest: 0.9999998039 0.9999998148 0.9999998611 0.9999999000 1.0000000000
## --------------------------------------------------------------------------------
## watched_more_60_percent 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##   313256        0        2    0.711   120913    0.386    0.474 
## 
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##   313256        0        4    0.935    2.398    1.275 
##                                   
## Value          1     2     3     4
## Frequency  94268 72745 73540 72703
## Proportion 0.301 0.232 0.235 0.232
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##   313256        0        7    0.979    3.841    2.338 
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency  55897 44933 44470 43073 40814 40671 43398
## Proportion 0.178 0.143 0.142 0.138 0.130 0.130 0.139
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   313256        0       24    0.996    15.38     6.84        1        6 
##      .25      .50      .75      .90      .95 
##       12       17       20       22       23 
## 
## lowest :  0  1  2  3  4, highest: 19 20 21 22 23
## --------------------------------------------------------------------------------
## weekend 
##        n  missing distinct 
##   313256        0        2 
##                           
## Value      weekday weekend
## Frequency   213961   99295
## Proportion   0.683   0.317
## --------------------------------------------------------------------------------
## time_of_day 
##        n  missing distinct 
##   313256        0        4 
##                                                   
## Value      Afternoon       Day   Evening     Night
## Frequency      46075     81102    135965     50114
## Proportion     0.147     0.259     0.434     0.160
## --------------------------------------------------------------------------------

The column descriptions are as follows.

  1. user_id – a unique identifier for the viewer

  2. program_id and series_id – these identify the program and the series that the program belongs to

  3. genre – the programme’s genre (e.g., drama, factual, news, sport, comedy, etc)

  4. start_date_time – the streaming start date/time of the event

  5. streaming_id – a unique identifier per streaming event

  6. prog_duration_min – the program duration in minutes

  7. time_viewed_min – how long the customer watched the program in minutes

  8. duration_more_30s - equals 1 if the program duration is more than 30 seconds, equals 0 otherwise

  9. time_viewed_more_5s - equals 1 if time_viewed is more than 5 seconds, equals 0 otherwise

  10. percentage_program_viewed – percentage of the program viewed

  11. watched_more_60_percent – equals 1 if more than 60% of the program is watched, equals 0 otherwise

  12. month, day, hour, weekend – timing of the viewing

  13. time_of_day – equals “Night” if the viewing occurs between 22 and 6AM, “Day” if it occurs between 6AM and 14, “Afternoon” if the it occurs between 14 and 17, “Evening” otherwise

Before we proceed let’s consider the usage in January only.

2 User based data

2.1 Data format

The data is presented to us in an event-based format (every row denotes the event that someone started streaming content from the iPlayer). Let’s change the current format to a customer-based dataset. In what dimensions could BBC iPlayer users be differentiated? Come up with variables that capture these from the data we are given.

2.2 Feature Engineering

  1. Total number of shows watched and total time spent watching shows on iPlayer by each user in the data.
  2. Proportion of shows watched during the weekend for each user.
  3. Proportion of shows watched during different times of day for each user.

Find the proportion of shows watched in each genre by each user. Add it to the data frame userdatall and use head function to show first few rows.

## # A tibble: 6 x 21
##   user_id noShows total_Time weekday weekend Afternoon    Day Evening  Night
##   <chr>     <int>      <dbl>   <dbl>   <dbl>     <dbl>  <dbl>   <dbl>  <dbl>
## 1 0001c6        2     16.7     1       0             0 0        1     0     
## 2 000c1a        2      0.310   1       0             0 1        0     0     
## 3 001c53        2      1.87    1       0             0 1        0     0     
## 4 001d44        1      1.06    0       1             0 0        1     0     
## 5 002b2e       31    534.      0.871   0.129         0 0.0968   0.806 0.0968
## 6 002b43        1      1.31    1       0             0 0        1     0     
## # … with 12 more variables: Children <dbl>, Comedy <dbl>, Drama <dbl>,
## #   Entertainment <dbl>, Factual <dbl>, Learning <dbl>, Music <dbl>,
## #   News <dbl>, NoGenre <dbl>, RelEthics <dbl>, Sport <dbl>, Weather <dbl>

Add useful variable for differentiating viewers by creating new one with existing columns in the dataset.

## # A tibble: 6 x 22
##   user_id noShows total_Time weekday weekend Afternoon    Day Evening  Night
##   <chr>     <int>      <dbl>   <dbl>   <dbl>     <dbl>  <dbl>   <dbl>  <dbl>
## 1 0001c6        2     16.7     1       0             0 0        1     0     
## 2 000c1a        2      0.310   1       0             0 1        0     0     
## 3 001c53        2      1.87    1       0             0 1        0     0     
## 4 001d44        1      1.06    0       1             0 0        1     0     
## 5 002b2e       31    534.      0.871   0.129         0 0.0968   0.806 0.0968
## 6 002b43        1      1.31    1       0             0 0        1     0     
## # … with 13 more variables: Children <dbl>, Comedy <dbl>, Drama <dbl>,
## #   Entertainment <dbl>, Factual <dbl>, Learning <dbl>, Music <dbl>,
## #   News <dbl>, NoGenre <dbl>, RelEthics <dbl>, Sport <dbl>, Weather <dbl>,
## #   Loyalty <dbl>

Comments:

I added a variable called ‘Loyalty’, which captures customers’ loyalty to the programmes they watch by calculating the average percentage of a program viewed by each customer, because it is an important property of users. It tells us whether the customer is loyal to the programmes they have watched or not. I calculated this variable by summing each user’s percentage of program viewed and dividing it by the number of shows.

3 Visualizing user-based data

Let’s visualize the information captured in the user based data. Let’s start with the correlations.

Observe the most correlated variables and the implication of this for the clustering.

The variables that are most correlated are noShows and total_Time. This is because if a person watches a higher number of shows, it naturally increases the total time spent watching programmes. Also, the weekend and weekday variables are completely negatively correlated, which means that people who watch programmes on the weekend usually don’t spend time watching shows on weekdays.

These strong correlations imply that those variables should be eliminated from the analysis dataset to produce a reasonable clustering analysis. This is because if the variables in the analysis are highly correlated, a collinearity problem arises: the shared information is effectively double-weighted relative to other variables, which distorts the clustering.

Investigate the distribution of noShows and total_Time using box plots (geom_boxplot) and histograms (geom_histogram).

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Comments: In the box plot and the scatter plot, we can see that there are a huge number of outliers in this data in terms of the commitment of BBC iPlayer users. However, I believe that as the number of outliers are huge, it can be also a factor of determining separate clusters.

3.1 Delete infrequent users

Delete the records for users whose total view time is less than 5 minutes and who views 5 or fewer programs. These users are not very likely to be informative.

4 Clustering with K-Means

4.1 Training a K-Means Model

Start training a K-Means model. I initiated it with 2 clusters and made sure to de-select the user_id variable and scale the data. Set max iterations to 100 and use 50 random starts. Displayed the cluster sizes and used summary("kmeans Object") to examine the components of the results of the clustering algorithm. (3045 and 162 points in each cluster)

##              Length Class  Mode   
## cluster      3207   -none- numeric
## centers        36   -none- numeric
## totss           1   -none- numeric
## withinss        2   -none- numeric
## tot.withinss    1   -none- numeric
## betweenss       1   -none- numeric
## size            2   -none- numeric
## iter            1   -none- numeric
## ifault          1   -none- numeric
## [1] 3045  162

4.2 Visualizing the results

4.2.1 Cluster centers

Plot the cluster centers. Describe the clusters that the algorithm suggests.

These two clusters basically show us two types of BBC iPlayer users. Cluster 2 contains only 162 users out of 3207 and is mainly made up of people who use the iPlayer for educational purposes, as we can see from the plot. The users in cluster 2 watch the Children and Learning genres, spending less time in total than people in cluster 1. They also use the service during the afternoon and day rather than the night and evening, which suggests these customers are mainly households with children. Thus, it is a meaningful cluster in that it distinguishes customers with children in their household.

This cluster analysis can help BBC iPlayer management to see what type of programmes are popular to this particular type of customers with children. This type of clustering analysis can be useful for the marketing purposes and product improvement.

4.2.2 Clusters vs variables

Plot a scatter plot for the viewers with respect to total_Time and weekday variables with color set to the cluster number of the user. What do we observe? Which variable seems to play a more prominent role in determining the clusters of users? Add an additional variable that should play a role in cluster assignments using size option.

## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract

## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

It shows that ‘Weekday’ is not the variable that actually distinguishes clusters 1 and 2. Meanwhile, the ‘Children’ variable affects the cluster assignment more than ‘Weekday’: as we can see, the points in cluster 2 are consistently larger than those in cluster 1.

4.2.3 Clusters vs PCA components

Repeat the previous step and use the first two principal components using the fviz_cluster function.

##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  1.771753e+00     9.843074e+00                    9.843074
## Dim.2  1.705520e+00     9.475111e+00                   19.318185
## Dim.3  1.523506e+00     8.463923e+00                   27.782108
## Dim.4  1.364902e+00     7.582789e+00                   35.364897
## Dim.5  1.316230e+00     7.312387e+00                   42.677283
## Dim.6  1.194488e+00     6.636045e+00                   49.313328
## Dim.7  1.147612e+00     6.375620e+00                   55.688948
## Dim.8  1.058451e+00     5.880281e+00                   61.569229
## Dim.9  1.005826e+00     5.587925e+00                   67.157154
## Dim.10 9.849678e-01     5.472043e+00                   72.629197
## Dim.11 9.285727e-01     5.158737e+00                   77.787934
## Dim.12 9.004910e-01     5.002728e+00                   82.790662
## Dim.13 8.003021e-01     4.446123e+00                   87.236785
## Dim.14 7.101773e-01     3.945429e+00                   91.182214
## Dim.15 6.305257e-01     3.502921e+00                   94.685134
## Dim.16 6.113851e-01     3.396584e+00                   98.081718
## Dim.17 3.452907e-01     1.918282e+00                  100.000000
## Dim.18 1.765332e-30     9.807400e-30                  100.000000

## Warning in fviz_pca_contrib(pca_bbc, choice = "var"): The function
## fviz_pca_contrib() is deprecated. Please use the function fviz_contrib() which
## can handle outputs of PCA, CA and MCA functions.

I am not worried about the outliers because I eliminated all the variables that are highly correlated, including ‘Day’. I eliminated the ‘Day’ variable because when I included it and conducted PCA, ‘Day’ and ‘Evening’ became the two highest-contribution variables, which does not make good sense for creating reasonable clusters. Now the two variables contributing most to differentiating the clusters are ‘Children’ and ‘News’ (we can see this on the PCA scree plot). Thus, I believe that the two clusters are not strongly affected by outliers in the ‘total_Time’ variable.

4.4 Silhouette method

Repeat the previous step for Silhouette analysis.

Summary of the conclusion from Elbow Chart and Silhouette analysis.

From the elbow chart, I can’t see the steep part where I can easily choose the proper number of clusters k. The slope decreases with the similar slope for any number of clusters. From silhouette method, I can see that cluster number 10 has the highest silhouette width. However, as the clusters overlap as the number of clusters increases, I would rather choose the number of clusters seeing the visualisation of clusters based on PCA.

4.5 Comparing k-means results with different k’s

## [1]  162 3045
## [1]  160  776 2271
## [1]  747  163  996 1301
## [1] 163 778 629 905 732

According to the 4 plots, I can see that there is a certain cluster that does not change as the number of clusters increases. As the most distinct cluster doesn’t change, it would be better to choose K = 3. This is because the other part (the upper-left part) overlaps more and more as the number of clusters increases.

# Plot K-means cluster centers for k = 2..5 and arrange them in one column.
#
# The original script repeated the same reshape-and-plot logic four times
# (and reused the names `xa` / `xa2k3` across different k's); it is factored
# into a single helper so every model is handled identically.

# Reshape a fitted kmeans model's `centers` matrix to long format and draw
# one dashed line per cluster across the (scaled) feature space.
#   model - a stats::kmeans fit whose `centers` rows are the cluster centers
#   k     - number of clusters (rows of `centers`); used for labels and title
plot_km_centers <- function(model, k) {
  centers_long <- data.frame(cluster = as.factor(seq_len(k)), model$centers) %>%
    gather(variable, value, -cluster, factor_key = TRUE)

  ggplot(centers_long, aes(x = variable, y = value)) +
    geom_line(aes(color = cluster, group = cluster),
              linetype = "dashed", size = 1) +
    geom_point(size = 1, shape = 4) +
    geom_hline(yintercept = 0) +
    theme(text = element_text(size = 10),
          axis.text.x = element_text(angle = 45, hjust = 1),
          legend.title = element_text(size = 5),
          legend.text = element_text(size = 5)) +
    ggtitle(paste0("K-means Centers k=", k))
}

graphknn2 <- plot_km_centers(model_km2, 2)
graphknn3 <- plot_km_centers(model_km3, 3)
graphknn4 <- plot_km_centers(model_km4, 4)
graphknn5 <- plot_km_centers(model_km5, 5)

grid.arrange(graphknn2, graphknn3, graphknn4, graphknn5, nrow = 4)

Looking at these 4 plots, it seems reasonable to set the ideal number of clusters to 4, because these 4 clusters clearly show different segments of people, such as those who are into Drama, Children & Learning, and News & Weather. In retrospect, I should have designed the new variable more carefully, as the Loyalty variable does not help to distinguish a specific customer segment. I personally think that total_Time should also have defined a segment, because some individuals simply spend a lot of time watching programmes without particular purposes or preferences (e.g., TVs in stores, or people who leave the TV on all day).

5 Comparing results of different clustering algorithms

5.1 PAM

Fit a PAM model for the k value I chose above for k-means. Determine how many points each cluster has. Plot the centers of the clusters and produce the PCA visualization.

##      size max_diss  av_diss diameter separation
## [1,] 1340 21.32385 3.057657 24.97547  0.5570248
## [2,]  576 11.58818 3.552756 17.26607  1.0263983
## [3,] 1141 16.17630 4.061424 21.89103  0.5570248
## [4,]  150 15.31108 4.789563 21.04540  1.3882904

5.2 H-Clustering

Use hierarchical clustering with the same k I chose above. Set hc_method equal to average and then ward.D to see the difference between the two methods. Visualize the results (called a dendrogram) using the plot function to observe the differences. How many points does each cluster have? Plot the centers of the clusters and produce the PCA visualization for ward.D.

## [1] 3196    1    8    2
## [1] 1702  916  437  152
##             Length  Class  Mode     
## merge          6412 -none- numeric  
## height         3206 -none- numeric  
## order          3207 -none- numeric  
## labels            0 -none- NULL     
## method            1 -none- character
## call              3 -none- call     
## dist.method       1 -none- character
## cluster        3207 -none- numeric  
## nbclust           1 -none- numeric  
## silinfo           3 -none- list     
## size              4 -none- numeric  
## data        5140821 dist   numeric
##   cluster size ave.sil.width
## 1       1 3196          0.58
## 2       2    1          0.00
## 3       3    8          0.61
## 4       4    2          0.90

##             Length  Class  Mode     
## merge          6412 -none- numeric  
## height         3206 -none- numeric  
## order          3207 -none- numeric  
## labels            0 -none- NULL     
## method            1 -none- character
## call              3 -none- call     
## dist.method       1 -none- character
## cluster        3207 -none- numeric  
## nbclust           1 -none- numeric  
## silinfo           3 -none- list     
## size              4 -none- numeric  
## data        5140821 dist   numeric
##   cluster size ave.sil.width
## 1       1 1702          0.13
## 2       2  916         -0.07
## 3       3  437          0.16
## 4       4  152          0.16

Plot the centers of H-clusters and compare the results with K-Means and PAM.

Comments: Hierarchical clustering and K-means both correctly identify the ‘Children’ category. In hierarchical clustering the “news/sport/weather” type of user is still visible, but it is no longer characterized by a specific time of day, unlike K-means, where ‘Day’ shows a high reading. The category “spends a lot of time in front of the TV and has no genre preference” appears with both clustering methods. Based on the three methods above, I can see that the segments are well differentiated by genre. The clusters from PAM and K-means are very similar, while hierarchical clustering shows fairly different results.

6 Subsample check

We have chosen the number of clusters by now. We will try to reinforce the conclusions and verify that they are not due to chance by dividing the data into two equal parts. Use K-means clustering, fixing the number of clusters, and find the clusters in these two data sets separately. If we get similar-looking clusters, we can rest assured that our conclusions are robust. If not, we might want to reconsider the decision.

# Subsample robustness check: split the user data 50/50, fit k-means with
# k = 4 to each half, and compare the cluster-center profiles. Similar
# centers in both halves suggest the clustering is not due to chance.
set.seed(1234)
library(rsample)
train_test_split <- initial_split(userdatall2, prop = 0.5)
testing <- testing(train_test_split)   # 50% of the data set aside for testing
training <- training(train_test_split) # 50% of the data set aside for training

# Fit k-means to each half with the same settings as the full-data model.
# NOTE(review): assumes userdatall2 holds only numeric, scaled columns
# (user_id already dropped) — confirm upstream, kmeans() errors otherwise.
model_testing <- kmeans(testing, centers = 4, nstart = 50, iter.max = 100)
model_training <- kmeans(training, centers = 4, nstart = 50, iter.max = 100)

# NOTE(review): this PAM-based center table is not used by the plots below;
# kept for reference as in the original analysis.
userdatall_withClusters <- mutate(userdatall_withClusters,
                                  cluster_pam = as.factor(k4_pam$clustering))
center_locations <- userdatall_withClusters %>%
  group_by(cluster_pam) %>%
  summarise_at(vars(total_Time:Loyalty), mean)

# Reshape a kmeans model's centers to long format and plot one line per
# cluster. `title` distinguishes the two halves — the original gave both
# plots the identical title "K-means Centers k=4".
plot_half_centers <- function(model, title) {
  centers_long <- data.frame(cluster = as.factor(1:4), model$centers) %>%
    gather(variable, value, -cluster, factor_key = TRUE)

  ggplot(centers_long, aes(x = variable, y = value)) +
    geom_line(aes(color = cluster, group = cluster),
              linetype = "dashed", size = 1) +
    geom_point(size = 1, shape = 4) +
    geom_hline(yintercept = 0) +
    theme(text = element_text(size = 10),
          axis.text.x = element_text(angle = 45, hjust = 1),
          legend.title = element_text(size = 5),
          legend.text = element_text(size = 5)) +
    ggtitle(title)
}

training_plot <- plot_half_centers(model_training,
                                   "K-means Centers k=4 (training half)")
testing_plot <- plot_half_centers(model_testing,
                                  "K-means Centers k=4 (testing half)")

# Display both halves together so their centers can actually be compared
# (the original built testing_plot but only ever displayed training_plot).
grid.arrange(training_plot, testing_plot, nrow = 2)

7 Conclusions

I think I have chosen the right number of k because it shows the consistent results with the testing and training dataset. Also, from the plots I have created, I could observe that the 4 clusters showed different preferences for the genres especially for the Children & Learning genres. I have done good job at finding people who are dedicated to certain genres. However, I could have improved my analysis by adding new variable to make a separate cluster for the people who just spend way more time than others who are fanatic to BBC iPlayer. Based on this data, BBC can improve their programme qualities conducting separate analysis by user segments (Learning&Children, Drama, News&Weather)